https://www.statmethods.net/advstats/glm.html

# Load R packages into the library
# Data management packages
library(DescTools)
library(skimr)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(aod)
library(readxl)
# Visualization packages
#library(Deducer)
library(ggplot2)
# Machine learnning method packages
library(ROCR)
## Loading required package: gplots
## Registered S3 method overwritten by 'gdata':
##   method         from     
##   reorder.factor DescTools
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
################# 12 oct 2019 #########
# Import dataset # lending  Club

#https://www.kaggle.com/wendykan/lending-club-loan-data

#https://www.lendingclub.com/developers/listed-loans
setwd("~/Downloads/001.Analytics Adayar")
loan_data <- read.csv("loan.csv")
# Selecting the relevant variables in the dataset: 
# gouti1454: need to study the CSV file. check the columns and find the relavent data neeed to work with the model. 
# gouti1454 - here they have selected and given to practice.

loan_data <- loan_data[,c("grade","sub_grade","term","loan_amnt","issue_d","loan_status","emp_length",
                          "home_ownership", "annual_inc","verification_status","purpose","dti",
                          "delinq_2yrs","addr_state","int_rate", "inq_last_6mths","mths_since_last_delinq",
                          "mths_since_last_record","open_acc","pub_rec","revol_bal","revol_util","total_acc")]

table(loan_data$sub_grade)
## 
##     A1     A2     A3     A4     A5     B1     B2     B3     B4     B5     C1 
##  86790  69562  73184  95874 107617 125341 126621 131514 139793 140288 145903 
##     C2     C3     C4     C5     D1     D2     D3     D4     D5     E1     E2 
## 131116 129193 127115 116726  81787  72899  64819  56896  48023  33573  29924 
##     E3     E4     E5     F1     F2     F3     F4     F5     G1     G2     G3 
##  26708  22763  22671  13413   9305   7791   6124   5167   4106   2688   2094 
##     G4     G5 
##   1712   1568
# Data management for missing observations
loan_data$mths_since_last_delinq[is.na(loan_data$mths_since_last_delinq)] <- 0
loan_data$mths_since_last_record[is.na(loan_data$mths_since_last_record)] <- 0
var.has.na <- lapply(loan_data, function(x){any(is.na(x))})
#gouti1454- "which" here brings all the na values 
num_na <- which( var.has.na == TRUE )   #gouti1454- "which" is the filter funtion in like excel. More likely to be a Pivot table conductions. 
#gouti1454 trying to find percentage of NA; "per_na".
per_na <- num_na/dim(loan_data)[1] 
loan_data <- loan_data[complete.cases(loan_data),]
# Visualization of the data
# Bar chart of the loan amount
#gouti1454 ploting graph -loan amount vs nos of loans.  
loanamount_barchart <- ggplot(data=loan_data, aes(loan_data$loan_amnt)) + 
  geom_histogram(breaks=seq(0, 35000, by=1000), 
                 col="black", aes(fill=..count..)) +
  scale_fill_gradient("Count", low="green1", high="yellowgreen")+
  labs(title="Loan Amount", x="Amount", y="Number of Loans")
loanamount_barchart

ggplotly(p = ggplot2::last_plot())
# Box plot of loan amount
#gouti1454 below lines will give the box plots on loan status vs loan amounts. 
# gouti1454 you can find the outlayers also.
box_plot_stat <- ggplot(loan_data, aes(loan_status, loan_amnt))
box_plot_stat + geom_boxplot(aes(fill = loan_status)) +
  theme(axis.text.x = element_blank()) +
  labs(list(title = "Loan amount by status", x = "Loan Status", y = "Amount"))

ggplotly(p = ggplot2::last_plot())
#gouti1454 Skimming the data here to understand on the variables and detials 
library(skimr)
skim(loan_data)
Data summary
Name loan_data
Number of rows 2257158
Number of columns 23
_______________________
Column type frequency:
factor 10
numeric 13
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
grade 0 1 FALSE 7 B: 662667, C: 648977, A: 432543, D: 323723
sub_grade 0 1 FALSE 35 C1: 145672, B5: 140095, B4: 139604, B3: 131349
term 0 1 FALSE 2 36: 1607378, 60: 649780
issue_d 0 1 FALSE 139 Mar: 61947, Oct: 48619, Oct: 46151, May: 46138
loan_status 0 1 FALSE 9 Ful: 1041059, Cur: 917435, Cha: 261425, Lat: 21838
emp_length 0 1 FALSE 12 10+: 747409, 2 y: 203521, < 1: 189715, 3 y: 180628
home_ownership 0 1 FALSE 6 MOR: 1109487, REN: 893755, OWN: 252691, ANY: 995
verification_status 0 1 FALSE 3 Sou: 885491, Not: 742837, Ver: 628830
purpose 0 1 FALSE 14 deb: 1276134, cre: 516456, hom: 150100, oth: 138994
addr_state 0 1 FALSE 51 CA: 314071, NY: 186150, TX: 186013, FL: 161784

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
loan_amnt 0 1 15045.82 9186.89 500.00 8000.00 12900.00 20000.00 40000.00 ▆▇▅▂▂
annual_inc 0 1 78042.29 112711.78 0.00 46000.00 65000.00 93000.00 110000000.00 ▇▁▁▁▁
dti 0 1 18.83 14.16 -1.00 11.90 17.84 24.49 999.00 ▇▁▁▁▁
delinq_2yrs 0 1 0.31 0.87 0.00 0.00 0.00 0.00 58.00 ▇▁▁▁▁
int_rate 0 1 13.09 4.83 5.31 9.49 12.62 15.99 30.99 ▆▇▃▁▁
inq_last_6mths 0 1 0.58 0.89 0.00 0.00 0.00 1.00 33.00 ▇▁▁▁▁
mths_since_last_delinq 0 1 16.84 23.07 0.00 0.00 0.00 30.00 226.00 ▇▂▁▁▁
mths_since_last_record 0 1 11.50 28.47 0.00 0.00 0.00 0.00 129.00 ▇▁▁▁▁
open_acc 0 1 11.62 5.64 1.00 8.00 11.00 14.00 101.00 ▇▁▁▁▁
pub_rec 0 1 0.20 0.57 0.00 0.00 0.00 0.00 86.00 ▇▁▁▁▁
revol_bal 0 1 16664.01 22904.14 0.00 5960.00 11332.00 20252.00 2904836.00 ▇▁▁▁▁
revol_util 0 1 50.34 24.71 0.00 31.50 50.30 69.40 892.30 ▇▁▁▁▁
total_acc 0 1 24.17 11.99 1.00 15.00 22.00 31.00 176.00 ▇▁▁▁▁
# Focus on the historical loans
#gouti1454 Here same variable is reassigned with the new condition.
# gouti1454 here the loan status which is current is removed.
#gouti1454 "whcih" command can also be used here
loan_data=as.data.frame(loan_data[loan_data$loan_status!="Current", ]) #gouti1454 not equal to current. 
#gouti1454 gives the details in each % categorey, defining percentail income here.
limits_inc = quantile(loan_data$annual_inc, seq(0,1,0.1))
labels <- c(0, limits_inc[2:10], "+inf")
labels <- prettyNum(labels, big.mark = ",")
#gouti1454 setting range 
labels <- paste(labels[1:10], labels[2:11], sep = "-")

loan_data$annual_inc <- cut(loan_data$annual_inc, limits_inc, labels = labels, include.lowest = T)
loan_data[,"annual_inc"] <- as.character(loan_data[,"annual_inc"])
# Create binary variables for the logistic regression analysis
# Annual_inc
loan_data$annual_inc[loan_data$annual_inc == "70,000- 80,000"| loan_data$annual_inc == "80,000- 94,000" | loan_data$annual_inc == "94,000-120,000" | loan_data$annual_inc == "120,000-   +inf" ] <- 1
#gouti1454 now setting loan_data which is not equal as zero
loan_data$annual_inc[loan_data$annual_inc != 1] <- 0
loan_data$annual_inc <- as.numeric(loan_data$annual_inc)
# Home_ownership
loan_data$home_ownership <- as.character(loan_data$home_ownership)
loan_data$home_ownership[loan_data$home_ownership=="OWN" | loan_data$home_ownership=="MORTGAGE"  ] <- 1       
loan_data$home_ownership[loan_data$home_ownership!=1] <- 0
# Dealinq_2yrs
loan_data$delinq_2yrs <- as.character(loan_data$delinq_2yrs)
loan_data$delinq_2yrs[loan_data$delinq_2yrs=="0"] <- 0
loan_data$delinq_2yrs[loan_data$delinq_2yrs!= 0] <- 1
# Verification status: if Verified = 1 ; otherwise = 0
loan_data$verification_status = as.character(loan_data$verification_status)
loan_data$verification_status[loan_data$verification_status == "Verified" | loan_data$verification_status == "Source Verified"] = 1
loan_data$verification_status[loan_data$verification_status != 1] = 0
loan_data$verification_status=as.numeric(loan_data$verification_status)
# Dti
dti_quant <- quantile(loan_data$dti, seq(0, 1, 0.1))
labels = c(0,prettyNum(dti_quant[2:10], big.mark = ","), "+Inf")
labels = paste(labels[1:10],labels[2:11], sep = "-")
loan_data <- mutate(loan_data, dti= cut(loan_data$dti, breaks = dti_quant, labels = factor(labels), include.lowest = T)) #gouti1454 included the lowest values as True
loan_data$dti <- as.character(loan_data$dti)
loan_data$dti[loan_data$dti == "0-6.57" | loan_data$dti == "12.13-14.32" | loan_data$dti == "14.32-16.49" ] <- 1
loan_data$dti[loan_data$dti!=1] <- 0
# Status
loan_data$loan_status <- as.character(loan_data$loan_status)
loan_data$loan_status[loan_data$loan_status == "Charged Off" | loan_data$loan_status == "Default" ] <- 1
loan_data$loan_status[loan_data$loan_status != 1] <- 0
#gouti1454 here we are using the "table" to get the count.
table(loan_data$loan_status)
## 
##       0       1 
## 1078267  261456
#gouti1454 percentage values. this is used in the model
PercTable(loan_data$loan_status)
##                      
##        freq      perc
##                      
## 0 1'078'267     80.5%
## 1   261'456     19.5%
# Change to nummeric variables:
#gouti1454 revol_util is convered to numeric and reassiging to same variable. 
#gouti1454 Fixed= true means, fixing decimel places as 2. 
loan_data[,"revol_util"] <- as.numeric(sub("%", "",loan_data$"revol_util", fixed =TRUE))/100
loan_data[,"int_rate"] <- as.numeric(sub("%", "",loan_data$"int_rate", fixed =TRUE))/100
loan_data$loan_status <- as.numeric(loan_data$loan_status)
# Grouping variables
#gouti1454 now grouping as 2,1,0.
loan_data$purpose <- as.character(loan_data$purpose)
loan_data$purpose[loan_data$purpose == "car" | loan_data$purpose == "major_purchase" | 
                    loan_data$purpose == "home_improvement"| loan_data$purpose == "credit_card" ] <- 2
loan_data$purpose[loan_data$purpose == "moving" | loan_data$purpose == "small_business" | 
                    loan_data$purpose == "renewable_energy" ] <- 0
loan_data$purpose[loan_data$purpose!= 0 & loan_data$purpose!= 2 ] <- 1
loan_data$purpose <- as.factor(loan_data$purpose)
##Machine Learning: Multiple  Logistic Regression Models
# Logistic: Logit stepwise Regression
# gouti1454: have removed "dti" from glm since "dti" has only zero as values
logregmodI <- glm(loan_status ~ loan_amnt + home_ownership + annual_inc
            + verification_status + purpose + delinq_2yrs 
            + int_rate + inq_last_6mths + mths_since_last_delinq 
            + revol_bal + revol_util + total_acc,
            data = loan_data, family = binomial(link= "logit"))
#here "link=logit" means logitic regression, there are other models are there#


step <- stepAIC(logregmodI, direction="both")
## Start:  AIC=1237284
## loan_status ~ loan_amnt + home_ownership + annual_inc + verification_status + 
##     purpose + delinq_2yrs + int_rate + inq_last_6mths + mths_since_last_delinq + 
##     revol_bal + revol_util + total_acc
## 
## 
## Step:  AIC=1237284
## loan_status ~ loan_amnt + home_ownership + verification_status + 
##     purpose + delinq_2yrs + int_rate + inq_last_6mths + mths_since_last_delinq + 
##     revol_bal + revol_util + total_acc
## 
##                          Df Deviance     AIC
## - mths_since_last_delinq  1  1237258 1237282
## <none>                       1237258 1237284
## - total_acc               1  1237302 1237326
## - purpose                 2  1237309 1237331
## - delinq_2yrs             1  1237361 1237385
## - inq_last_6mths          1  1237560 1237584
## - revol_util              1  1237628 1237652
## - revol_bal               1  1237955 1237979
## - loan_amnt               1  1238505 1238529
## - verification_status     1  1238699 1238723
## - home_ownership          1  1241127 1241151
## - int_rate                1  1288616 1288640
## 
## Step:  AIC=1237282
## loan_status ~ loan_amnt + home_ownership + verification_status + 
##     purpose + delinq_2yrs + int_rate + inq_last_6mths + revol_bal + 
##     revol_util + total_acc
## 
##                          Df Deviance     AIC
## <none>                       1237258 1237282
## + mths_since_last_delinq  1  1237258 1237284
## - total_acc               1  1237302 1237324
## - purpose                 2  1237309 1237329
## - delinq_2yrs             1  1237365 1237387
## - inq_last_6mths          1  1237560 1237582
## - revol_util              1  1237628 1237650
## - revol_bal               1  1237957 1237979
## - loan_amnt               1  1238507 1238529
## - verification_status     1  1238699 1238721
## - home_ownership          1  1241132 1241154
## - int_rate                1  1288647 1288669
step$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## loan_status ~ loan_amnt + home_ownership + annual_inc + verification_status + 
##     purpose + delinq_2yrs + int_rate + inq_last_6mths + mths_since_last_delinq + 
##     revol_bal + revol_util + total_acc
## 
## Final Model:
## loan_status ~ loan_amnt + home_ownership + verification_status + 
##     purpose + delinq_2yrs + int_rate + inq_last_6mths + revol_bal + 
##     revol_util + total_acc
## 
## 
##                       Step Df  Deviance Resid. Df Resid. Dev     AIC
## 1                                         1339710    1237258 1237284
## 2             - annual_inc  0 0.0000000   1339710    1237258 1237284
## 3 - mths_since_last_delinq  1 0.5275552   1339711    1237258 1237282
# Create a training- and testing dataset
  percing <- floor((nrow(loan_data)/4)*3)       
  loan <- loan_data[sample(nrow(loan_data)), ]          
  loan.training <- loan[1:percing, ]              
  loan.testing <- loan[(percing+1):nrow(loan), ]
# Begin training of the model
  # gouti1454: have removed "dti" from glm since "dti" has only zero as values
  fitting.logistic <- glm(loan_status ~ loan_amnt + home_ownership + verification_status + 
                   purpose + delinq_2yrs + int_rate + inq_last_6mths + 
                   mths_since_last_delinq + revol_bal + revol_util + total_acc,
                 data=loan.training,family = binomial(link= "logit"))
  summary(fitting.logistic)
## 
## Call:
## glm(formula = loan_status ~ loan_amnt + home_ownership + verification_status + 
##     purpose + delinq_2yrs + int_rate + inq_last_6mths + mths_since_last_delinq + 
##     revol_bal + revol_util + total_acc, family = binomial(link = "logit"), 
##     data = loan.training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6410  -0.6774  -0.5437  -0.4046   3.0287  
## 
## Coefficients:
##                          Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)            -3.166e+00  2.057e-02 -153.908  < 2e-16 ***
## loan_amnt               1.003e-05  3.275e-07   30.626  < 2e-16 ***
## home_ownership1        -2.916e-01  5.473e-03  -53.271  < 2e-16 ***
## verification_status     2.050e-01  6.322e-03   32.419  < 2e-16 ***
## purpose1               -8.674e-02  1.728e-02   -5.019 5.19e-07 ***
## purpose2               -1.086e-01  1.777e-02   -6.112 9.87e-10 ***
## delinq_2yrs1            6.079e-02  6.590e-03    9.224  < 2e-16 ***
## int_rate                1.120e+01  5.768e-02  194.260  < 2e-16 ***
## inq_last_6mths          4.017e-02  2.680e-03   14.991  < 2e-16 ***
## mths_since_last_delinq  8.618e-06  1.154e-04    0.075     0.94    
## revol_bal              -3.657e-06  1.699e-07  -21.521  < 2e-16 ***
## revol_util              1.971e-01  1.168e-02   16.876  < 2e-16 ***
## total_acc               1.551e-03  2.367e-04    6.552 5.69e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 992204  on 1004791  degrees of freedom
## Residual deviance: 928159  on 1004779  degrees of freedom
## AIC: 928185
## 
## Number of Fisher Scoring iterations: 4
# AUC and ROC curve
  fitted.results <- predict(fitting.logistic, newdata = loan.testing, type = "response")
  loan.testing$prob <- fitted.results
  pred <- prediction(loan.testing$prob,loan.testing$loan_status)
  auc1 <- performance(pred, measure = "auc")
  auc1@y.values
## [[1]]
## [1] 0.6823697
# Performance function
ROCRperf = performance(pred, "tpr", "fpr")
# Plot the ROC graph Add threshold labels 
plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7))
abline(0, 1, col= "black")